
#Load all required libraries here:----
library(readr)
library(dplyr)
library(ggplot2)
library(dplyr)
library(tibble)
library(randomForest)


# Load diamonds.csv and drop its first column.
# NOTE(review): presumably the first column is a row index written on export
# rather than a predictor -- confirm against the CSV.
diamonds <- read_csv(
  "C:/Users/User/OneDrive/Desktop/R folder/assignment 2/diamonds.csv"
) %>%
  select(-1)

# View() opens a data viewer window, which is only useful (and only reliably
# available) in an interactive session, so skip it when run via Rscript/source.
if (interactive()) {
  View(diamonds)
}


# Report the class of every column in diamonds (one entry per column name)
data_types <- diamonds %>% sapply(class)
data_types

# Draw a scatter-plot matrix of the numeric columns.
# pairs() is used purely for its side effect (drawing on the active graphics
# device); its return value carries no plot object, so it is neither stored
# nor printed -- doing so would only emit a stray NULL.
pairs(
  ~ carat + depth + table + price + x + y + z,
  data = diamonds,
  main = "Simple Scatterplot Matrix"
)

# Multiple linear regression: price is the response and every remaining
# column of diamonds is a predictor (the `.` on the right-hand side)
lm.diamond <- lm(formula = price ~ ., data = diamonds)
summary(object = lm.diamond)

#Result
#Residual standard error: 1130 on 53916 degrees of freedom
#Multiple R-squared:  0.9198,	Adjusted R-squared:  0.9198 
#F-statistic: 2.688e+04 on 23 and 53916 DF,  p-value: < 2.2e-16



# There is a relationship between the predictors and the response:
#1. The F-statistic (2.688e+04) is very high.
#2. The p-value (< 2.2e-16) is very low.
#3. Multiple R-squared is high (0.9198): approximately 91.98% of the variance in price is explained by the predictors.
#4. Adjusted R-squared (0.9198) is the same as Multiple R-squared, suggesting the model is not overfitted by unnecessary predictors.
#5. Most of the predictors have p-values less than 0.05, indicating significant associations with the price.

#y                9.609     19.333   0.497    0.619    
#z              -50.119     33.486  -1.497    0.134
#6 y and z do NOT appear to have a statistically significant relationship with the response, as their p-values are greater than 0.05

#carat        11256.978     48.628 231.494  < 2e-16 ***
#7 In the base model, a one-unit increase in carat (e.g. from 0.21 to 1.21) is predicted to increase the price by about 11,257, assuming all other variables remain constant


# Fit the full linear model and pair each observed price with its prediction

# Regress price on every other column of diamonds
lm.diamond <- lm(formula = price ~ ., data = diamonds)

# In-sample predictions: the model is applied to the rows it was fitted on
predicted_price <- predict(object = lm.diamond, newdata = diamonds)

# Two-column tibble named y_and_yHat: observed price next to predicted price
y_and_yHat <- tibble(
  price = diamonds[["price"]],
  predicted_price = predicted_price
)

print(y_and_yHat)


# Sort the tibble by observed price (smallest to largest) and draw a scatter
# plot of observed vs. predicted price.
# NOTE: despite its name, y_and_yHat_sorted ends up holding the ggplot object,
# because the sorted tibble is piped straight into ggplot().
y_and_yHat_sorted <- y_and_yHat %>%
  arrange(price) %>%
  ggplot() +
  geom_point(
    mapping = aes(x = price, y = predicted_price),
    # A constant colour is *set* outside aes(); inside aes() the string "red"
    # would be treated as data, producing a legend and a default-palette colour.
    color = "red"
  ) +
  ggtitle("Scatter Plot of Price vs Predicted Price") +
  labs(x = "Price", y = "Predicted Price")
y_and_yHat_sorted


# Expand the regression model to include carat^2.
# `.` already contributes the linear carat term, so only the squared term is
# added here. I() makes `^` mean arithmetic squaring rather than formula
# crossing. Using poly(carat, 2) on top of `.` would add a second, perfectly
# collinear linear term and leave an NA (aliased) coefficient in the summary.
lm.diamond_poly <- lm(price ~ . + I(carat^2), data = diamonds)
summary(lm.diamond_poly)


#Report the residual standard error 

#Residual standard error, linear model: 1130
#Residual standard error, polynomial model: 1118
#The polynomial model is better as it has the lower residual standard error

#Hyperparameter tuning

# Load the pre-assigned cross-validation fold labels for the diamonds data
# NOTE(review): presumably one fold id per row of diamonds -- confirm
fold_v_diamond <- readRDS(
  file = "C:/Users/User/OneDrive/Desktop/R folder/assignment 2/fold_v_diamond.RData"
)
print(x = fold_v_diamond)

# Grid search over polynomial degree with cross-validation.
# For every degree, price ~ poly(carat, degree) is fitted on all folds but one
# and the mean absolute error (MAE) is measured on both the training rows and
# the held-out rows.

# Parameter grid: polynomial degrees to compare
degrees <- 1:5

# set seed
set.seed(10)

# Distinct fold labels, in order of first appearance
fold_ids <- unique(fold_v_diamond)

# Per-degree storage, preallocated and named "Degree 1", "Degree 2", ...
# Each element will hold one MAE per fold.
degree_labels <- paste("Degree", degrees)
degree_training_MAE <- setNames(vector("list", length(degrees)), degree_labels)
degree_testing_MAE <- setNames(vector("list", length(degrees)), degree_labels)

for (degree in degrees) {
  # One slot per fold, filled in place instead of growing with c()
  fold_training_MAE <- numeric(length(fold_ids))
  fold_testing_MAE <- numeric(length(fold_ids))

  for (i in seq_along(fold_ids)) {
    fold_counter <- fold_ids[i]

    # Rows in the current fold are the test set; all other rows train the model
    train_data <- diamonds[fold_v_diamond != fold_counter, ]
    test_data <- diamonds[fold_v_diamond == fold_counter, ]

    # `degree` inside the formula is resolved from the enclosing loop variable
    model <- lm(price ~ poly(carat, degree), data = train_data)

    y_train_pred <- predict(model, newdata = train_data)
    y_test_pred <- predict(model, newdata = test_data)

    # MAE = mean of |observed - predicted|
    train_mae <- mean(abs(train_data$price - y_train_pred))
    test_mae <- mean(abs(test_data$price - y_test_pred))

    fold_training_MAE[i] <- train_mae
    fold_testing_MAE[i] <- test_mae
  }

  # Store all fold MAEs for this degree
  degree_label <- paste("Degree", degree)
  degree_training_MAE[[degree_label]] <- fold_training_MAE
  degree_testing_MAE[[degree_label]] <- fold_testing_MAE
}

# Average MAE across folds for each degree (named numeric vectors)
degree_training_MAE_avg <- vapply(degree_training_MAE, mean, numeric(1))
degree_testing_MAE_avg <- vapply(degree_testing_MAE, mean, numeric(1))


print(degree_training_MAE)
print(degree_testing_MAE)
print(degree_training_MAE_avg)
print(degree_testing_MAE_avg)


# Best model: degree 5, as it has the lowest average testing MAE.


# Line chart of average training and testing MAE against polynomial degree.
# Polynomial degree is on the x axis and MAE on the y axis.
# NOTE: `data` ends up holding the ggplot object, not the data frame, because
# the data frame is built inline and handed directly to ggplot().
data <- ggplot(
  data = data.frame(
    Degree = degrees,
    training_MAE = unlist(degree_training_MAE_avg),
    testing_MAE = unlist(degree_testing_MAE_avg)
  )
) +
  # The colour aesthetic is mapped to a label so both lines share one legend
  geom_line(mapping = aes(x = Degree, y = training_MAE, color = "Training MAE")) +
  geom_line(mapping = aes(x = Degree, y = testing_MAE, color = "Testing MAE")) +
  scale_color_manual(values = c("Training MAE" = "blue", "Testing MAE" = "red")) +
  labs(
    title = "Training and Testing MAE vs Polynomial Degree",
    x = "Polynomial Degree",
    y = "Mean Absolute Error (MAE)"
  )
data


# Working copy of the built-in iris data set, with a preview of its first rows
wd <- iris
head(x = wd)

# Pre-defined cross-validation fold labels for iris
# NOTE(review): presumably one fold id per row of iris -- confirm
cv_folds <- readRDS(
  file = "C:/Users/User/OneDrive/Desktop/R folder/assignment 2/folds_iris.RDS"
)


# ------------------------------------------------------------------------------
# Random forest hyperparameter comparison (cross-validated accuracy)
# ------------------------------------------------------------------------------
# Separator lines must start with `#`: a bare run of dashes is parsed by R as
# unary minus operators applied to the next expression, not as a comment.

# Cross-validated classification accuracy of a random forest predicting Species.
#
# dataset: data frame containing a Species column plus predictor columns.
# folds:   vector of fold labels, compared element-wise against the rows of
#          `dataset`; each distinct label is used once as the held-out fold.
# mtry:    passed to randomForest() as `mtry`.
# ntree:   passed to randomForest() as `ntree`.
# seed:    RNG seed set once before the first fold, for reproducibility.
#
# Returns a numeric vector with one accuracy (share of correctly predicted
# held-out rows) per fold, in order of unique(folds).
cv_rf_accuracy <- function(dataset, folds, mtry, ntree, seed = 1) {
  set.seed(seed)

  fold_ids <- unique(folds)
  fold_accuracy <- numeric(length(fold_ids))

  for (i in seq_along(fold_ids)) {
    # Rows in the current fold are held out; all other rows train the forest
    is_test <- folds == fold_ids[i]
    train_rows <- dataset[!is_test, ]
    test_rows <- dataset[is_test, ]

    forest <- randomForest(
      Species ~ .,
      data = train_rows,
      mtry = mtry,
      ntree = ntree
    )
    predicted <- predict(forest, newdata = test_rows)

    fold_accuracy[i] <- mean(predicted == test_rows$Species)
  }

  fold_accuracy
}

# Model 1: Random forest with mtry = 3, ntree = 100
accuracies_model1 <- cv_rf_accuracy(wd, cv_folds, mtry = 3, ntree = 100)
avg_accuracies_model1 <- mean(accuracies_model1)
print(accuracies_model1)
print(avg_accuracies_model1)

# Model 2: Random forest with mtry = 2, ntree = 100
accuracies_model2 <- cv_rf_accuracy(wd, cv_folds, mtry = 2, ntree = 100)
avg_accuracies_model2 <- mean(accuracies_model2)
print(accuracies_model2)
print(avg_accuracies_model2)

# Model 3: Random forest with mtry = 3, ntree = 500
accuracies_model3 <- cv_rf_accuracy(wd, cv_folds, mtry = 3, ntree = 500)
avg_accuracies_model3 <- mean(accuracies_model3)
print(accuracies_model3)
print(avg_accuracies_model3)

# Model 4: Random forest with mtry = 2, ntree = 500
accuracies_model4 <- cv_rf_accuracy(wd, cv_folds, mtry = 2, ntree = 500)
avg_accuracies_model4 <- mean(accuracies_model4)
print(accuracies_model4)
print(avg_accuracies_model4)

# Summary table built from the averages computed above, so it cannot drift
# out of sync with the models the way hand-copied numbers can.
data <- data.frame(
  Model = c(1, 2, 3, 4),
  accuracy = c(
    avg_accuracies_model1,
    avg_accuracies_model2,
    avg_accuracies_model3,
    avg_accuracies_model4
  )
)

print(data)

#Model  accuracy
#1     1 0.9595105
#2     2 0.9439549
#3     3 0.9595105
#4     4 0.9539549
# Models 1 and 3 (both with mtry = 3) are the best, as they have the highest average accuracy

















